"""
Text Cleaning Utilities

Provides functions for cleaning text with encoding issues.
"""

from loguru import logger


def remove_surrogates(text: str) -> str:
    """
    Remove surrogate characters that can't be encoded in UTF-8.

    PDF extraction and other text extraction methods can produce malformed Unicode
    with surrogate characters. This function ensures the text is safe for UTF-8 encoding.

    Args:
        text: Raw text that may contain surrogate characters

    Returns:
        Cleaned text safe for UTF-8 encoding
    """
    if not text:
        return text

    try:
        # Encode with surrogatepass to handle surrogates, then decode with replace
        # to convert any remaining issues to replacement characters
        return text.encode("utf-8", errors="surrogatepass").decode(
            "utf-8", errors="replace"
        )
    except Exception as e:
        logger.warning(
            f"Error cleaning text with surrogatepass: {e}, using fallback"
        )
        # Fallback: ignore any characters that can't be encoded
        return text.encode("utf-8", errors="ignore").decode(
            "utf-8", errors="ignore"
        )
